ISLR2 Chapter 8 - Tree-Based Methods

R. J. Serrano

3/28/2022

Tree-Based Methods —

Learning objectives:

Original script source: https://emilhvitfeldt.github.io/ISLR-tidymodels-labs/tree-based-methods.html

Decision Tree Terminology —

Source: https://medium.com/@scid2230/decision-tree-basics-34d864483c42

Decision Trees (Classification) Explained (StatQuest) —

8.1 Fitting Classification Trees —

We will also use the Carseats data set from the ISLR package to demonstrate a classification model.

Carseats
skimr::skim(Carseats)
Data summary
Name Carseats
Number of rows 400
Number of columns 11
_______________________
Column type frequency:
factor 3
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
ShelveLoc 0 1 FALSE 3 Med: 219, Bad: 96, Goo: 85
Urban 0 1 FALSE 2 Yes: 282, No: 118
US 0 1 FALSE 2 Yes: 258, No: 142

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Sales 0 1 7.50 2.82 0 5.39 7.49 9.32 16.27 ▁▆▇▃▁
CompPrice 0 1 124.97 15.33 77 115.00 125.00 135.00 175.00 ▁▅▇▃▁
Income 0 1 68.66 27.99 21 42.75 69.00 91.00 120.00 ▇▆▇▆▅
Advertising 0 1 6.64 6.65 0 0.00 5.00 12.00 29.00 ▇▃▃▁▁
Population 0 1 264.84 147.38 10 139.00 272.00 398.50 509.00 ▇▇▇▇▇
Price 0 1 115.79 23.68 24 100.00 117.00 131.00 191.00 ▁▂▇▆▁
Age 0 1 53.32 16.20 25 39.75 54.50 66.00 80.00 ▇▆▇▇▇
Education 0 1 13.90 2.62 10 12.00 14.00 16.00 18.00 ▇▇▃▇▇

We create a new variable High, which takes the value "No" when Sales <= 8 and "Yes" otherwise; the Sales predictor is then removed, since it perfectly predicts High.

# Recode the response: High = "Yes" when Sales > 8, "No" otherwise,
# then drop Sales so it cannot leak into the predictors.
carseats <- Carseats %>%
  as_tibble() %>%
  mutate(High = factor(if_else(Sales <= 8, "No", "Yes"))) %>%
  select(-Sales)

Exploratory Data Analysis (EDA)

Let’s count High

# Class balance of the target variable
count(carseats, High)

High plot

# Bar chart of the target distribution; the legend duplicates the axis labels
ggplot(carseats, aes(x = High, fill = High)) +
  geom_bar() +
  theme(legend.position = "none")

Correlation Analysis

Correlation (Pearson)

# Recode every factor column to numeric so correlations can be computed:
# binary factors become 0/1, ShelveLoc becomes an ordered 1/2/3 score.
carseats_num <- carseats %>%
  mutate(
    across(c(High, Urban, US), ~ ifelse(.x == "No", 0, 1)),
    ShelveLoc = case_when(
      ShelveLoc == "Bad"    ~ 1,
      ShelveLoc == "Medium" ~ 2,
      TRUE                  ~ 3
    )
  )

carseats_num

library(dlookr)

# Pearson correlation matrix heatmap
plot(correlate(carseats_num))

Correlation (Spearman)

# Spearman (rank-based) correlation heatmap — robust to monotone nonlinearity
plot(correlate(carseats_num, method = "spearman"))

Build a model

Split dataset into train/test

# Reproducible 75/25 split, stratified on High to preserve class balance
set.seed(1234)
carseats_split <- initial_split(carseats, prop = 0.75, strata = High)

carseats_train <- training(carseats_split)
carseats_test  <- testing(carseats_split)

Create decision tree classification spec

# rpart-backed classification tree with default hyperparameters
class_tree_spec <-
  decision_tree() %>%
  set_mode("classification") %>%
  set_engine("rpart")

Fit the decision tree model

# Train the tree on all predictors in the training set
class_tree_fit <- class_tree_spec %>%
  fit(High ~ ., data = carseats_train)

Visualize our decision tree

# Plot the underlying rpart object
# (roundint = FALSE silences integer-rounding warnings)
rpart.plot(extract_fit_engine(class_tree_fit), roundint = FALSE)

Evaluate the model

Confusion matrix (train)

# Training-set confusion matrix
class_tree_fit %>%
  augment(new_data = carseats_train) %>%
  conf_mat(truth = High, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  159  17
##        Yes  18 106
# Training-set accuracy
class_tree_fit %>%
  augment(new_data = carseats_train) %>%
  accuracy(truth = High, estimate = .pred_class)

Training accuracy: 88.3%

Confusion matrix (test)

# Test-set confusion matrix
class_tree_fit %>%
  augment(new_data = carseats_test) %>%
  conf_mat(truth = High, estimate = .pred_class)
##           Truth
## Prediction No Yes
##        No  39   7
##        Yes 20  34
# Test-set accuracy
class_tree_fit %>%
  augment(new_data = carseats_test) %>%
  accuracy(truth = High, estimate = .pred_class)

Testing accuracy: 73% (overfit)

Tuning the model

Let’s try to tune the cost_complexity of the decision tree to find a more optimal complexity. We use the class_tree_spec object and use the set_args() function to specify that we want to tune cost_complexity. This is then passed directly into the workflow object to avoid creating an intermediate object. Also, since the dataset has only 400 observations (rows), we’ll apply bootstrapping so each resample is as large as the training set.

# 100 stratified bootstrap resamples of the training set
# (apparent = TRUE also keeps the "apparent" resample: train == assess)
set.seed(1234)
carseats_boot <- bootstraps(carseats_train, times = 100, apparent = TRUE, strata = High)

carseats_boot

# Tunable tree: complexity penalty, maximum depth, and minimum node size
tree_spec <-
  decision_tree(
    cost_complexity = tune(),
    tree_depth      = tune(),
    min_n           = tune()
  ) %>%
  set_mode("classification") %>%
  set_engine("rpart")

To be able to tune the hyperparameters we need 2 more objects: a resamples object (here, the bootstrap resamples) and a grid of values to try. Since we are tuning 3 hyperparameters at 5 levels each, a regular grid (125 candidates) is still manageable.

Setup parallel processing —-

## [1] 9
# 5 levels per hyperparameter -> 5^3 = 125 grid candidates;
# cost_complexity ranges are on the log10 scale (1e-4 to 1e-1)
tree_grid <- grid_regular(
  cost_complexity(range = c(-4, -1)),
  tree_depth(range = c(3, 7)),
  min_n(range = c(10, 20)),
  levels = 5
)

# Evaluate every candidate on every bootstrap resample, tracking accuracy
set.seed(2001)
tune_res <- tune_grid(
  tree_spec,
  High ~ .,
  resamples = carseats_boot,
  grid = tree_grid,
  metrics = metric_set(accuracy)
)

Evaluate the model

# Aggregated metric per hyperparameter combination
collect_metrics(tune_res)

Using autoplot() shows which values of cost_complexity appear to produce the highest accuracy.

tune_res %>% autoplot()

We can now select the best performing value with select_best(), finalize the workflow by updating the value of cost_complexity and fit the model on the full training data set.

# Select the hyperparameter combination with the highest bootstrap accuracy.
# The metric is named explicitly: calling select_best() without `metric`
# is deprecated in current releases of the tune package.
best_complexity <- select_best(tune_res, metric = "accuracy")

# Lock the winning hyperparameters into the model specification
class_tree_final <- finalize_model(tree_spec, best_complexity)

# Refit the finalized specification on the full training set
class_tree_final_fit <- fit(class_tree_final, High ~ ., data = carseats_train)

class_tree_final_fit
## parsnip model object
## 
## n= 300 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 300 123 No (0.59000000 0.41000000)  
##    2) ShelveLoc=Bad,Medium 237  73 No (0.69198312 0.30801688)  
##      4) Advertising< 13.5 203  50 No (0.75369458 0.24630542)  
##        8) Price>=109.5 120  14 No (0.88333333 0.11666667) *
##        9) Price< 109.5 83  36 No (0.56626506 0.43373494)  
##         18) CompPrice< 124.5 65  20 No (0.69230769 0.30769231)  
##           36) Price>=76.5 55  12 No (0.78181818 0.21818182) *
##           37) Price< 76.5 10   2 Yes (0.20000000 0.80000000) *
##         19) CompPrice>=124.5 18   2 Yes (0.11111111 0.88888889) *
##      5) Advertising>=13.5 34  11 Yes (0.32352941 0.67647059)  
##       10) Age>=70.5 7   1 No (0.85714286 0.14285714) *
##       11) Age< 70.5 27   5 Yes (0.18518519 0.81481481) *
##    3) ShelveLoc=Good 63  13 Yes (0.20634921 0.79365079)  
##      6) Price>=135 15   6 No (0.60000000 0.40000000) *
##      7) Price< 135 48   4 Yes (0.08333333 0.91666667) *

Visualize the tuned decision tree (classification)

At last, we can visualize the model, and we see that the better-performing model is less complex than the original model we fit.

# Visualize the tuned (simpler) tree
rpart.plot(extract_fit_engine(class_tree_final_fit), roundint = FALSE)

Variable importance

The broomstick package (https://github.com/njtierney/broomstick/) enables the analyst to extract the decision tree variable importance from the fitted model.

library(forcats)

# Variable importance from the tuned classification tree,
# ordered so the most important variable appears at the top
class_tree_final_fit$fit %>%
  broomstick::tidy() %>%
  mutate(variable = fct_rev(as_factor(variable))) %>%
  ggplot(aes(x = importance, y = variable)) +
  geom_col(fill = "steelblue")

Final evaluation

Confusion matrix (train, best model)

# Training-set confusion matrix for the tuned model
class_tree_final_fit %>%
  augment(new_data = carseats_train) %>%
  conf_mat(truth = High, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  164  33
##        Yes  13  90
# Training-set accuracy for the tuned model
class_tree_final_fit %>%
  augment(new_data = carseats_train) %>%
  accuracy(truth = High, estimate = .pred_class)

Training accuracy: 86.3%

Confusion matrix (test, best model)

# Test-set confusion matrix for the tuned model
class_tree_final_fit %>%
  augment(new_data = carseats_test) %>%
  conf_mat(truth = High, estimate = .pred_class)
##           Truth
## Prediction No Yes
##        No  42  12
##        Yes 17  29
# Test-set accuracy for the tuned model
class_tree_final_fit %>%
  augment(new_data = carseats_test) %>%
  accuracy(truth = High, estimate = .pred_class)

Testing accuracy: 71%

8.2 - Fitting Regression Trees

We will now show how we fit a regression tree. This is very similar to what we saw in the last section. The main difference here is that the response we are looking at will be continuous instead of categorical.

Decision Trees (Regression) Explained (StatQuest)

EDA

Let’s plot a histogram for Sales (target)

# Distribution of the continuous target for the regression task
ggplot(Carseats, aes(x = Sales)) +
  geom_histogram(fill = "steelblue")

Pearson correlation

# Recode factor columns to numeric, then plot Pearson correlations
Carseats %>%
  mutate(
    across(c(Urban, US), ~ ifelse(.x == "No", 0, 1)),
    ShelveLoc = case_when(
      ShelveLoc == "Bad"    ~ 1,
      ShelveLoc == "Medium" ~ 2,
      TRUE                  ~ 3
    )
  ) %>%
  correlate() %>%
  plot()

Build the regression tree

We can reuse class_tree_spec as a base for the regression decision tree specification.

# Reuse the rpart tree specification, switching the mode to regression
reg_tree_spec <- set_mode(class_tree_spec, "regression")

We are using the Carseats dataset. Let’s do the validation split.

# Fresh split for the regression task (default prop = 3/4, no stratification)
set.seed(1010)
carseats_split <- initial_split(Carseats)

carseats_train <- training(carseats_split)
carseats_test  <- testing(carseats_split)

Fit the decision tree regression model

# Fit the regression tree on all predictors
reg_tree_fit <- reg_tree_spec %>%
  fit(Sales ~ ., data = carseats_train)
reg_tree_fit
## parsnip model object
## 
## n= 300 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 300 2529.534000  7.469333  
##     2) ShelveLoc=Bad,Medium 235 1476.666000  6.695702  
##       4) Price>=124.5 86  393.091600  5.283721  
##         8) CompPrice< 147.5 72  303.087600  4.916528  
##          16) Price>=137.5 27   99.462070  3.735556 *
##          17) Price< 137.5 45  143.374700  5.625111  
##            34) Advertising< 14.5 38   99.190880  5.249211 *
##            35) Advertising>=14.5 7    9.665971  7.665714 *
##         9) CompPrice>=147.5 14   30.370240  7.172143 *
##       5) Price< 124.5 149  813.155300  7.510671  
##        10) Age>=50.5 91  411.524700  6.751758  
##          20) Price>=86.5 80  283.338300  6.353750  
##            40) CompPrice< 123.5 52  157.669400  5.689231  
##              80) Price>=102.5 24   52.750780  4.710833  
##               160) ShelveLoc=Bad 8   12.693000  3.120000 *
##               161) ShelveLoc=Medium 16    9.688775  5.506250 *
##              81) Price< 102.5 28   62.252070  6.527857 *
##            41) CompPrice>=123.5 28   60.061870  7.587857 *
##          21) Price< 86.5 11   23.347450  9.646364 *
##        11) Age< 50.5 58  266.987700  8.701379  
##          22) Income< 59.5 18   65.389180  7.101111 *
##          23) Income>=59.5 40  134.760100  9.421500 *
##     3) ShelveLoc=Good 65  403.719500 10.266310  
##       6) Price>=109.5 42  197.649800  9.155238  
##        12) Price>=142.5 12   36.647220  7.152500 *
##        13) Price< 142.5 30   93.618500  9.956333  
##          26) Age>=61.5 9   17.323960  8.537778 *
##          27) Age< 61.5 21   50.422110 10.564290  
##            54) CompPrice< 129.5 11   14.599670  9.515455 *
##            55) CompPrice>=129.5 10   10.411360 11.718000 *
##       7) Price< 109.5 23   59.542770 12.295220 *

Visualize our decision tree

# Visualize the fitted regression tree
rpart.plot(extract_fit_engine(reg_tree_fit), roundint = FALSE)

Evaluate the model

Collect metrics using augment

# RMSE on the training set, then on the held-out test set
reg_tree_fit %>%
  augment(new_data = carseats_train) %>%
  rmse(truth = Sales, estimate = .pred)
reg_tree_fit %>%
  augment(new_data = carseats_test) %>%
  rmse(truth = Sales, estimate = .pred)

Tuning the regression model

Now let us again try to tune the cost_complexity to find the best performing model.

# Workflow: formula preprocessor + regression tree with tunable cost complexity
reg_tree_wf <- workflow() %>%
  add_formula(Sales ~ .) %>%
  add_model(set_args(reg_tree_spec, cost_complexity = tune()))

Create the bootstrap folds.

# 1000 bootstrap resamples of the regression training set
set.seed(4321)
carseats_boot <- bootstraps(carseats_train, times = 1000, apparent = TRUE)

carseats_boot

Create the tuning grid.

# 10 cost-complexity candidates on the log10 scale, from 1e-5 to 1e-1
param_grid <- grid_regular(cost_complexity(range = c(-5, -1)), levels = 10)

# Default regression metrics (rmse, rsq) are computed on every resample
tune_res <- tune_grid(
  reg_tree_wf,
  resamples = carseats_boot,
  grid = param_grid
)

Evaluate the model

It appears that a higher-complexity model is to be preferred according to our cross-validation.

tune_res %>% autoplot()

We select the best-performing model according to "rmse" and fit the final model on the whole training data set.

# Pick the cost complexity with the lowest bootstrap RMSE
best_complexity <- select_best(tune_res, metric = "rmse")

# Lock the winner into the workflow and refit on the full training set
reg_tree_final <- finalize_workflow(reg_tree_wf, best_complexity)

reg_tree_final_fit <- reg_tree_final %>% fit(data = carseats_train)
reg_tree_final_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Formula
## Model: decision_tree()
## 
## -- Preprocessor ----------------------------------------------------------------
## Sales ~ .
## 
## -- Model -----------------------------------------------------------------------
## n= 300 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 300 2529.534000  7.469333  
##     2) ShelveLoc=Bad,Medium 235 1476.666000  6.695702  
##       4) Price>=124.5 86  393.091600  5.283721  
##         8) CompPrice< 147.5 72  303.087600  4.916528  
##          16) Price>=137.5 27   99.462070  3.735556  
##            32) Income< 90 20   74.584220  3.207000  
##              64) Population< 223.5 9   33.593560  2.347778 *
##              65) Population>=223.5 11   28.910000  3.910000 *
##            33) Income>=90 7    3.326371  5.245714 *
##          17) Price< 137.5 45  143.374700  5.625111  
##            34) Advertising< 14.5 38   99.190880  5.249211  
##              68) Population>=380 11   11.129290  4.260909 *
##              69) Population< 380 27   72.940210  5.651852  
##               138) Age>=43 17   36.912490  5.160588 *
##               139) Age< 43 10   24.950210  6.487000 *
##            35) Advertising>=14.5 7    9.665971  7.665714 *
##         9) CompPrice>=147.5 14   30.370240  7.172143 *
##       5) Price< 124.5 149  813.155300  7.510671  
##        10) Age>=50.5 91  411.524700  6.751758  
##          20) Price>=86.5 80  283.338300  6.353750  
##            40) CompPrice< 123.5 52  157.669400  5.689231  
##              80) Price>=102.5 24   52.750780  4.710833  
##               160) ShelveLoc=Bad 8   12.693000  3.120000 *
##               161) ShelveLoc=Medium 16    9.688775  5.506250 *
##              81) Price< 102.5 28   62.252070  6.527857  
##               162) ShelveLoc=Bad 10   30.689160  5.652000 *
##               163) ShelveLoc=Medium 18   19.629840  7.014444 *
##            41) CompPrice>=123.5 28   60.061870  7.587857  
##              82) Advertising< 12.5 21   37.242320  7.114762  
##               164) Price>=110 13   14.217510  6.506154 *
##               165) Price< 110 8   10.384790  8.103750 *
##              83) Advertising>=12.5 7    4.018743  9.007143 *
##          21) Price< 86.5 11   23.347450  9.646364 *
##        11) Age< 50.5 58  266.987700  8.701379  
##          22) Income< 59.5 18   65.389180  7.101111 *
##          23) Income>=59.5 40  134.760100  9.421500  
##            46) Price>=105.5 19   36.968860  8.594211 *
##            47) Price< 105.5 21   73.022200 10.170000  
##              94) ShelveLoc=Bad 7   15.859600  8.660000 *
##              95) ShelveLoc=Medium 14   33.221550 10.925000 *
##     3) ShelveLoc=Good 65  403.719500 10.266310  
##       6) Price>=109.5 42  197.649800  9.155238  
##        12) Price>=142.5 12   36.647220  7.152500 *
##        13) Price< 142.5 30   93.618500  9.956333  
##          26) Age>=61.5 9   17.323960  8.537778 *
## 
## ...
## and 6 more lines.

Visualize the tuned decision tree (regression)

# Visualize the tuned regression tree
rpart.plot(extract_fit_engine(reg_tree_final_fit), roundint = FALSE)

Variable importance

The broomstick package (https://github.com/njtierney/broomstick/) enables the analyst to extract the decision tree variable importance from the fitted model.

# Variable importance for the tuned regression tree. The object is a fitted
# workflow, so the rpart engine fit sits two levels down: $fit (parsnip) $fit (rpart).
reg_tree_final_fit$fit$fit %>%
  broomstick::tidy() %>%
  mutate(variable = fct_rev(as_factor(variable))) %>%
  ggplot(aes(x = importance, y = variable)) +
  geom_col(fill = "steelblue")

Collect tuned metrics using augment

# Tuned-model RMSE on training and test sets
reg_tree_final_fit %>%
  augment(new_data = carseats_train) %>%
  rmse(truth = Sales, estimate = .pred)
reg_tree_final_fit %>%
  augment(new_data = carseats_test) %>%
  rmse(truth = Sales, estimate = .pred)

8.3 - Bagging and Random Forests

Random Forest diagram

Source: https://en.wikipedia.org/wiki/Random_forest#/media/File:Random_forest_diagram_complete.png

Example —

Here we apply bagging and random forests to the Carseats data set. We will be using the ranger package as the engine. A bagging model is the same as a random forest where mtry is equal to the number of predictors. We can specify the mtry to be .cols() which means that the number of columns in the predictor matrix is used. This is useful if you want to make the specification more general and usable to many different data sets. .cols() is one of many descriptors in the parsnip package. We also set importance = "permutation" in set_engine() to tell the engine to save the information regarding variable importance. This is needed for this engine if we want to use the vip package later.

# Bagging = a random forest whose mtry equals the number of predictors.
# The surrounding text describes exactly that via the .cols() descriptor, but
# the original spec omitted mtry, so the engine fell back to its random-forest
# default — fixed here so every predictor is considered at every split.
# importance = "permutation" stores permutation importance for vip() later.
bagging_spec <- rand_forest(mtry = .cols(), trees = 2000) %>%
     set_engine("ranger", importance = "permutation") %>%
     set_mode("regression")

Fit the model.

# Train the bagged ensemble
bagging_fit <- bagging_spec %>%
  fit(Sales ~ ., data = carseats_train)

Evaluate the model

… and we take a look at the testing performance (notice an improvement over the decision tree).

# Bagging RMSE on training and test sets
bagging_fit %>%
  augment(new_data = carseats_train) %>%
  rmse(truth = Sales, estimate = .pred)
bagging_fit %>%
  augment(new_data = carseats_test) %>%
  rmse(truth = Sales, estimate = .pred)

We can also create a quick scatterplot between the true and predicted value to see if we can make any diagnostics.

# Observed vs. predicted; points on the identity line are perfect predictions
bagging_fit %>%
  augment(new_data = carseats_test) %>%
  ggplot(aes(x = Sales, y = .pred)) +
  geom_abline() +
  geom_point(alpha = 0.5)

Variable importance

# Permutation-importance plot for the bagged model
bagging_fit %>% vip()

Random Forest using a set of features (mtry)

By default, random forest implementations use p / 3 variables at each split when building a random forest of regression trees, and sqrt(p) variables when building a random forest of classification trees. Here we use mtry = 6.

# Random forest sampling 6 of the predictors at each split
rf_spec <-
  rand_forest(mtry = 6) %>%
  set_mode("regression") %>%
  set_engine("ranger", importance = "permutation")

Fit the model.

# Train the random forest
rf_fit <- rf_spec %>%
  fit(Sales ~ ., data = carseats_train)

Evaluate the model

This model has similar performance compared to the bagging model.

# Random-forest RMSE on training and test sets
rf_fit %>%
  augment(new_data = carseats_train) %>%
  rmse(truth = Sales, estimate = .pred)
rf_fit %>%
  augment(new_data = carseats_test) %>%
  rmse(truth = Sales, estimate = .pred)

We can likewise plot the true value against the predicted value.

# Observed vs. predicted for the random forest
rf_fit %>%
  augment(new_data = carseats_test) %>%
  ggplot(aes(x = Sales, y = .pred)) +
  geom_abline() +
  geom_point(alpha = 0.5)

Variable importance

# Permutation-importance plot for the random forest
rf_fit %>% vip()

Boosting

We will now fit a boosted tree model. The xgboost package gives a good implementation of boosted trees. It has many parameters to tune and we know that setting trees too high can lead to overfitting. Nevertheless, let us try fitting a boosted tree. We set trees = 2000 to grow 2000 trees with a maximal depth of 4 by setting tree_depth = 4.

# Gradient-boosted trees: 2000 rounds, each tree at most 4 levels deep
boost_spec <-
  boost_tree(trees = 2000, tree_depth = 4) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

Fit the model.

# Train the boosted ensemble
boost_fit <- boost_spec %>%
  fit(Sales ~ ., data = carseats_train)

Evaluate the model

… and the rmse is a little high in this case, which is probably because we didn’t tune any of the parameters.

# Boosted-tree RMSE on training and test sets
boost_fit %>%
  augment(new_data = carseats_train) %>%
  rmse(truth = Sales, estimate = .pred)
boost_fit %>%
  augment(new_data = carseats_test) %>%
  rmse(truth = Sales, estimate = .pred)

Tuning the xgboost regression model

We are using the Carseats dataset. Let’s do the validation split with a different seed.

# Fresh split (new seed) for the xgboost tuning experiment
set.seed(1001)
carseats_split <- initial_split(Carseats)

carseats_train <- training(carseats_split)
carseats_test  <- testing(carseats_split)

Create the bootstrap folds.

# 1000 bootstrap resamples stratified on the (binned) continuous target
set.seed(2341)
carseats_boot <- bootstraps(carseats_train, times = 1000, apparent = TRUE, strata = Sales)

carseats_boot

Model specification with hyperparameter tuning

# Boosted-tree spec with a fixed tree count and three tunable hyperparameters
xgb_spec <- boost_tree(
  trees      = 2000,
  mtry       = tune(),
  min_n      = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

Create the workflow()

# Workflow: formula preprocessor + tunable xgboost model
xgb_wf <- workflow() %>%
  add_formula(Sales ~ .) %>%
  add_model(xgb_spec)

Grid tuning with finetune::race_anova()

Tune the xgboost model with race_anova() to accelerate the tuning speed.

library(finetune)

set.seed(4242)

# Racing (ANOVA) tuning: clearly inferior candidates are eliminated after the
# first resamples, which is much faster than a full grid search. tictoc
# brackets the call to report elapsed time.
tictoc::tic()
xgb_rs <- tune_race_anova(
  xgb_wf,
  carseats_boot,
  grid = 30,
  control = control_race(verbose_elim = TRUE)
)
tictoc::toc()
## 339.62 sec elapsed
print(xgb_rs)

Evaluate the model

# Show how candidates were eliminated across resamples
xgb_rs %>% plot_race()

Show best model

# Top candidates by RMSE. The metric is named explicitly: relying on the
# default metric in show_best() is deprecated in current tune releases.
show_best(xgb_rs, metric = "rmse")

Select best model.

# Best candidate by RMSE. `metric` must be passed by name: in tune >= 1.0
# the positional form select_best(x, "rmse") is deprecated.
select_best(xgb_rs, metric = "rmse")

Last fit

# Finalize the workflow with the lowest-RMSE candidate, then fit on the
# training set and evaluate on the test set in one step with last_fit().
# `metric` is passed by name — the positional form is deprecated in tune >= 1.0.
xgb_last_fit <-
     xgb_wf %>%
     finalize_workflow(select_best(xgb_rs, metric = "rmse")) %>%
     last_fit(carseats_split)

xgb_last_fit

Collect metrics

# Test-set metrics from the final fit
collect_metrics(xgb_last_fit)

Feature importance

# Pull the fitted parsnip model out of the last_fit result,
# then plot the top 12 features by importance as points
xgb_fit <- xgb_last_fit %>% extract_fit_parsnip()

xgb_fit %>% vip(geom = "point", num_features = 12)